In this notebook, we perform a clustering analysis of the CD7 dataset on features extracted by SCIP.
%load_ext autoreload
%autoreload 2
import warnings
from scip_workflows.common import *
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.preprocessing import scale
import anndata
import scanpy
scanpy.settings.verbosity = 3
import flowutils
import scipy.stats
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform
from kneed import KneeLocator
from scip.features import intensity
props = intensity.props.copy()
props.remove("kurtosis")
props.remove("skewness")
def asinh_scale(x, t):
    # Asinh-transform intensities with flowutils (top of scale t, m=4.5 decades,
    # a=1 extra negative decade), then mean-center each feature without
    # rescaling the variance.
    return scale(
        flowutils.transforms.asinh(x, channel_indices=None, t=t, m=4.5, a=1),
        with_std=False,
    )
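A quick sanity check on synthetic values (illustrative only, not from the dataset): the transform compresses large intensities while the centering merely shifts each column to zero mean.
demo = numpy.array([[0.0], [10.0], [100.0], [1000.0]])
asinh_scale(demo, t=1000)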
plt.rcParams['figure.dpi'] = 150
try:
features = snakemake.input.features
index = snakemake.input.index
columns = snakemake.input.columns
fillna = bool(int(snakemake.wildcards.fillna))
output = snakemake.output[0]
except NameError:
# data_dir = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/cd7/800/results/scip/202203221745/")
data_dir = Path("/home/maximl/scratch/data/vsc/datasets/cd7/800/scip/061020221736/")
features = data_dir / "features.parquet"
index = data_dir / "indices" / "index.npy"
columns = data_dir / "indices" / "columns.npy"
fillna = False
output = data_dir / f"adata_{int(fillna)}.h5ad"
df = pq.read_table(features).to_pandas()
df = df.set_index(["meta_panel", "meta_replicate", "meta_P", "meta_id"])
df = df.loc["D"]
df = df[[c for c in numpy.load(columns, allow_pickle=True) if c in df.columns]]
df = df.loc[numpy.load(index, allow_pickle=True)]
df = df.sort_index()
df.shape
(29927, 1188)
# Drop features with zero variance across all events; keep the metadata columns.
var = VarianceThreshold().fit(df.filter(regex="feat"))
df = pandas.concat([
df.filter(regex="feat").iloc[:, var.get_support()],
df.filter(regex="meta")
], axis=1)
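As a toy illustration of the variance filter above (hypothetical column names): a feature that is constant across all events has zero variance and is dropped by the default threshold.
toy = pandas.DataFrame({"feat_const": [1.0, 1.0, 1.0], "feat_var": [2.0, 3.0, 4.0]})
VarianceThreshold().fit(toy).get_support()
# array([False,  True])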
df.isna().all(axis=0).any()
False
df.filter(regex="feat").isna().all(axis=1).sum()
0
seaborn.histplot(data=df.isna().sum())
if fillna:
    df = df.fillna(0)
else:
    # Drop every feature column that contains missing values.
    df = df.drop(columns=df.columns[df.isna().sum() > 0])
obs = df.filter(regex='meta').reset_index()
obs.index = df.index
adata = anndata.AnnData(df.filter(regex="feat").astype(numpy.float32), obs=obs)
adata.raw = adata.copy()
adata.obs["meta_replicate"] = adata.obs["meta_replicate"].astype("category")
markers = [col for col in adata.var.index if col.startswith("feat_sum")]
adata_pre = adata.copy()
%%time
scanpy.pp.scale(adata_pre)
scanpy.tl.pca(adata_pre, svd_solver='arpack')
scanpy.pp.neighbors(adata_pre, n_neighbors=30)
scanpy.tl.umap(adata_pre)
scanpy.pl.umap(adata_pre, color=["meta_replicate"])
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:45)
CPU times: user 2min, sys: 11.8 s, total: 2min 12s
Wall time: 55.6 s
# Features matching these substrings take only a limited number of distinct
# values; they are treated as discrete in the mutual information computation below.
discrete = ["median", "area", "euler"]
discrete_cols = [c for c in adata.var_names if any(d in c for d in discrete)]
discrete_cols_i = [i for i, c in enumerate(adata.var_names) if any(d in c for d in discrete)]
adata.var["is_marker"] = [any(n.endswith("feat_combined_sum_%s" % m) for m in ["DAPI", "EGFP", "RPe", "APC"]) for n in adata.var_names]
adata.var["do_asinh"] = [(any(m in n for m in ["DAPI", "EGFP", "RPe", "APC"]) and any(o in n for o in props)) for n in adata.var_names]
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sc_df = scanpy.get.obs_df(adata, keys=adata.var_names.to_list())
    # Intensity features: asinh-transform against the per-group maximum, then center.
    sc_df[adata.var_names[adata.var.do_asinh].to_list()] = (
        sc_df[adata.var_names[adata.var.do_asinh]]
        .groupby(["meta_replicate", "meta_P"])
        .transform(lambda x: asinh_scale(x, x.max()))
    )
    # All remaining features: standardize within each (replicate, position) group.
    sc_df[adata.var_names[~adata.var.do_asinh].to_list()] = (
        sc_df[adata.var_names[~adata.var.do_asinh]]
        .groupby(["meta_replicate", "meta_P"])
        .transform(lambda x: scale(x))
    )
adata = anndata.AnnData(X=sc_df, obs=adata.obs, var=adata.var, raw=adata.raw)
CPU times: user 1min 26s, sys: 3.08 s, total: 1min 29s
Wall time: 1min 28s
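To make the per-(replicate, position) normalisation above concrete, here is a toy frame with hypothetical values: each group is transformed independently, which removes systematic intensity offsets between acquisitions.
toy = pandas.DataFrame({
    "meta_replicate": [1, 1, 2, 2],
    "meta_P": [1, 1, 1, 1],
    "feat_x": [1.0, 3.0, 10.0, 30.0],
}).set_index(["meta_replicate", "meta_P"])
toy.groupby(["meta_replicate", "meta_P"]).transform(lambda x: scale(x))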
def map_names(a):
return {
"feat_combined_sum_DAPI": "DAPI",
"feat_combined_sum_EGFP": "CD45",
"feat_combined_sum_RPe": "Siglec 8",
"feat_combined_sum_APC": "CD15"
}[a]
aligned_df = scanpy.get.obs_df(adata, keys=adata.var_names[adata.var.is_marker].to_list()).reset_index()
melted_df = pandas.melt(aligned_df, id_vars=["meta_P", "meta_replicate"], value_vars=adata.var_names[adata.var.is_marker].to_list())
melted_df.variable = melted_df.variable.apply(map_names)
grid = seaborn.FacetGrid(data=melted_df, col="meta_replicate", row="variable", sharey="row", aspect=1.5, margin_titles=True)
grid.map_dataframe(seaborn.stripplot, x="meta_P", y="value", size=1, alpha=0.5)
grid.set_axis_labels("Well image position", "Fluorescence intensity")
grid.set_titles(col_template="Replicate {col_name}", row_template="{row_name}")
grid.add_legend()
# plt.savefig(data_dir / "figures/qc_intensity_distribution_post.png", bbox_inches='tight', pad_inches=0)
def rediscretize(v):
    # Replace each value by the mean of its histogram bin (ten bins by default),
    # reducing the feature to a small set of representative levels. Empty bins
    # yield NaN means, which triggers the "Mean of empty slice" warning below;
    # such bins are never indexed, so the output contains no NaNs.
    bin_idx = numpy.digitize(v, bins=numpy.histogram_bin_edges(v))
    bin2mu = [numpy.mean(v[bin_idx == i]) for i in range(1, numpy.max(bin_idx) + 1)]
    return numpy.fromiter((bin2mu[i - 1] for i in bin_idx), dtype=float)
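A small illustration on synthetic values: 0.0, 0.1 and 0.2 fall in the first of ten bins and all map to the bin mean 0.1, while the empty intermediate bins are what emit the RuntimeWarnings seen below.
rediscretize(numpy.array([0.0, 0.1, 0.2, 5.0, 5.1, 10.0]))
# array([ 0.1 ,  0.1 ,  0.1 ,  5.05,  5.05, 10.  ])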
adata = adata[adata.to_df().isna().sum(axis=1) == 0].copy()
X = adata.X.copy()
X[:, discrete_cols_i] = numpy.apply_along_axis(rediscretize, 0, X[:, discrete_cols_i])
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in true_divide
  ret = ret.dtype.type(ret / rcount)
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mi_post = mutual_info_classif(
        X=X,
        y=adata.obs["meta_replicate"],
        discrete_features=discrete_cols_i,
        n_neighbors=30,
        random_state=0,
    )
mi_post = pandas.Series(mi_post, index=adata.var_names).sort_values(ascending=False)
CPU times: user 4min 12s, sys: 8.11 s, total: 4min 20s
Wall time: 4min 20s
kneedle = KneeLocator(
    numpy.arange(len(mi_post)), mi_post, S=40, curve="convex",
    direction="decreasing", online=False,
)
elbow_value = mi_post.iloc[int(kneedle.knee)]
kneedle.plot_knee()
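KneeLocator finds the point of maximum curvature on a decreasing convex series; a synthetic illustration (not the MI curve itself):
demo_y = 1 / numpy.arange(1, 100)
KneeLocator(numpy.arange(99), demo_y, curve="convex", direction="decreasing").knee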
# Keep features below the elbow: features with high mutual information with the
# replicate label mainly capture batch effects and are discarded.
selected_mi = mi_post[mi_post < elbow_value].index.values
len(selected_mi) / len(mi_post)
0.8407643312101911
len(mi_post[mi_post > elbow_value].index.values)
149
adata2 = adata[:, selected_mi].copy()
corr = scipy.stats.spearmanr(adata2.X).correlation
seaborn.clustermap(corr, vmin=-1, vmax=1, center=0)
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/seaborn/matrix.py:657: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
# spearmanr can return a numerically asymmetric matrix; squareform() below
# requires exact symmetry and zero distances on the diagonal, so enforce both.
corr = (corr + corr.T) / 2
numpy.fill_diagonal(corr, 1)
def feat_from_corr(corr):
    # We convert the correlation matrix to a distance matrix before performing
    # hierarchical clustering using Ward's linkage.
    distance_matrix = 1 - numpy.abs(corr)
    dist_linkage = hierarchy.ward(squareform(distance_matrix))
    # Cut the dendrogram at distance 0.1, grouping highly correlated features.
    clusters = fcluster(dist_linkage, 0.1, criterion="distance").ravel()
    # From each cluster, keep the feature with the highest standard deviation
    # as the cluster representative.
    indices = []
    for c in numpy.unique(clusters):
        ci = numpy.flatnonzero(clusters == c)
        indices.append(ci[adata2.X[:, ci].std(axis=0).argmax()])
    features = [adata2.var_names[i] for i in indices]
    return features, indices, clusters
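The 0.1 distance cut is strict: only features with near-perfect absolute Spearman correlation land in the same cluster. A toy illustration on a synthetic 3x3 correlation matrix:
toy_corr = numpy.array([
    [1.0, 0.99, 0.1],
    [0.99, 1.0, 0.1],
    [0.1, 0.1, 1.0],
])
toy_link = hierarchy.ward(squareform(1 - numpy.abs(toy_corr)))
fcluster(toy_link, 0.1, criterion="distance")
# features 0 and 1 receive the same cluster label; feature 2 stays separate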
features, indices, clusters = feat_from_corr(corr)
seaborn.clustermap(corr[indices, :][:, indices], vmin=-1, vmax=1, center=0)
adata2.var["selected_corr"] = False
adata2.var.loc[features, "selected_corr"] = True
adata2.var["feature_clusters"] = clusters
scanpy.pp.scale(adata2)
adata3 = adata2[:, adata2.var.selected_corr].copy()
scanpy.tl.pca(adata3, svd_solver='arpack', random_state=0)
scanpy.pp.neighbors(adata3, n_neighbors=30, method="umap", random_state=0)
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:10)
%%time
resolutions = [0.1, 0.25, 0.5, 0.75, 1]
for res in resolutions:
scanpy.tl.leiden(adata3, resolution=res, key_added=f"leiden_{res}", random_state=0)
running Leiden clustering
finished: found 3 clusters and added
'leiden_0.1', the cluster labels (adata.obs, categorical) (0:00:07)
running Leiden clustering
finished: found 4 clusters and added
'leiden_0.25', the cluster labels (adata.obs, categorical) (0:00:10)
running Leiden clustering
finished: found 7 clusters and added
'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:36)
running Leiden clustering
finished: found 10 clusters and added
'leiden_0.75', the cluster labels (adata.obs, categorical) (0:00:26)
running Leiden clustering
finished: found 12 clusters and added
'leiden_1', the cluster labels (adata.obs, categorical) (0:00:25)
CPU times: user 1min 43s, sys: 3.64 s, total: 1min 47s
Wall time: 1min 46s
scanpy.tl.umap(adata3, random_state=0)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:45)
scanpy.pl.umap(adata3, color=["leiden_0.25", "leiden_0.5", "leiden_0.75", "leiden_1", "meta_replicate"], legend_loc='on data')
adata3.obs["leiden"] = adata3.obs["leiden_0.75"]
seaborn.countplot(data=adata3.obs, x="leiden", hue="meta_replicate")
adata_out = anndata.AnnData(
X=adata2.X,
var=adata2.var,
obs=adata3.obs,
obsm=adata3.obsm,
raw=adata2.raw
)
import pickle
with open(data_dir / "adata.pickle", "wb") as fh:
pickle.dump(adata_out, fh)
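The snakemake branch above also defines an `output` path ending in .h5ad; a minimal sketch for persisting the result in that format as well, assuming `output` is the intended pipeline artifact:
adata_out.write(output)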